from octis.preprocessing.preprocessing import Preprocessing

# Stopword list handed to the Preprocessing pipeline below: one entry per line
# of ./english.txt, with surrounding whitespace stripped. The relative path is
# resolved against the current working directory. The `with` block guarantees
# the file handle is closed once the list has been built.
with open('./english.txt', 'r', encoding='utf-8') as stopword_file:
    stopwords = [line.strip() for line in stopword_file]

def preprocessing_english_stops_split():
    """Run the OCTIS preprocessing pipeline on the corpus and save the result.

    Builds a ``Preprocessing`` object with the module-level ``stopwords`` list
    as its ``stopword_list``, calls ``preprocess_dataset`` on
    ``../data/wos_mat_texts.txt`` and saves the returned dataset under
    ``../preprocessed_datasets/wos_mat/``. Both paths are relative, so they
    are resolved against the current working directory.

    Takes no arguments and returns ``None``.
    """
    # Keyword arguments forwarded unchanged to the Preprocessing constructor.
    # NOTE(review): the function name mentions "split" but split=False is
    # passed here -- confirm this is intended.
    pipeline_options = dict(
        vocabulary=None,
        max_features=None,
        remove_punctuation=True,
        lemmatize=False,
        stopword_list=stopwords,
        split=False,
        min_chars=3,
        min_words_docs=5,
        min_df=0.005,
        max_df=1.0,
    )
    preprocessor = Preprocessing(**pipeline_options)

    corpus = preprocessor.preprocess_dataset(
        documents_path="../data/wos_mat_texts.txt",
    )
    corpus.save("../preprocessed_datasets/wos_mat/")


# Module-level call: the preprocessing pipeline runs whenever this file is
# executed or imported (there is no `if __name__ == "__main__"` guard).
preprocessing_english_stops_split()